from Utils import *

# Load the aggregated stereotype table and show how many rows are flagged
# for keeping before and after the filter.
df = pd.read_csv('../Social Bias Probing/Stereotypes-w-PPLs-aggr-filtered.csv')
print(df['keep'].value_counts())
print(df)
# Keep only rows with keep == 1. The explicit copy makes the result an
# independent frame, so any later in-place modification does not act on a
# slice of the raw table (which is what triggers SettingWithCopyWarning).
df = df[df['keep'] == 1].copy()
print(df)

# Load the identity-term lists for every demographic axis and pass them
# through the filtering helpers. `all_targets` is threaded through each call
# and rebound to the helper's second return value.
# NOTE(review): `filter` and `filter_exceptions` are not defined in this file;
# presumably they come from `from Utils import *` (shadowing the builtin
# `filter`). The meaning of their boolean flags is not visible here -- confirm
# against Utils before changing any of the calls below.
all_targets = []
gender_df = pd.read_csv("identity_terms/gender.csv")  # TERM,POS removing duplicates w.r.t. TERM
sexuality_df = pd.read_csv("identity_terms/sexuality.csv")  # TERM,POS removing duplicates w.r.t. TERM
race_df = pd.read_csv("identity_terms/race.csv")  # TERM, missing POS but they're adj
countries_df = pd.read_csv("identity_terms/countries.csv")  # COUNTRY_ADJ and REGION_ADJ removing duplicates
religion_df = pd.read_csv("identity_terms/religion.csv")  # TERM w SEM == person/"", POS removing duplicates w.r.t. TERM
# Keep religion rows whose SEM is "person" or empty. read_csv parses empty
# cells as NaN by default, so comparing the raw column with "" never matches;
# normalise missing values to "" first so the empty-SEM rows are really kept.
religion_sem = religion_df["SEM"].fillna("")
religion_df = religion_df[(religion_sem == "person") | (religion_sem == "")]
disability_df = pd.read_csv("identity_terms/disability.csv")  # TERM,POS removing duplicates w.r.t. TERM
gender_df, all_targets = filter(all_targets, gender_df)
sexuality_df, all_targets = filter(all_targets, sexuality_df)
religion_df, all_targets = filter(all_targets, religion_df)
disability_df, all_targets = filter(all_targets, disability_df, True)
# Order matters: regions are extracted from the raw countries frame before
# `countries_df` is rebound to the helper's output on the following line.
regions_df, all_targets = filter_exceptions(all_targets, countries_df, False, False)
countries_df, all_targets = filter_exceptions(all_targets, countries_df, False, True)
race_df, all_targets = filter_exceptions(all_targets, race_df, True)
# NOTE(review): presumably the helpers return plain lists of terms, so `+`
# concatenates them here -- TODO confirm against Utils.
gender_sexuality_df = gender_df + sexuality_df
race_countries_df = race_df + regions_df + countries_df

# Pair every stereotype category with the identity terms used to build its
# probes. Entries equal to the string 'nan' are discarded and string terms
# are capitalised; non-string entries are passed through untouched.
category_terms = (
    ("gender", gender_sexuality_df),
    ("race", race_countries_df),
    ("culture", religion_df),
    ("disabled", disability_df),
)

data_dict = {}
for category, terms in category_terms:
    kept_terms = (term for term in terms if term != 'nan')
    data_dict[category] = [
        term.capitalize() if isinstance(term, str) else term
        for term in kept_terms
    ]

# Flat, lower-cased list of every term across all categories (in category
# order); rebuilt from scratch, replacing any earlier `all_targets` value.
all_targets = [
    str(term).lower()
    for terms in data_dict.values()
    for term in terms
]

print(len(all_targets))

print(df['category'].value_counts())
nProbes = len(df)
print(nProbes)

# Expand every stereotype row into one probe per identity term of its
# category: the probe text is "<identity> <stereotype>". Rows are collected in
# a plain list and turned into a DataFrame once at the end; concatenating onto
# the frame inside the loop would re-copy the whole table for every probe.
# A category missing from `data_dict` raises KeyError.
probe_rows = []
for _, row in df.iterrows():
    stereotype = row['stereotype']
    for target in data_dict[row['category']]:
        new_row = row.to_dict()
        new_row['identity'] = target
        new_row['probe'] = f"{target} {stereotype}"
        probe_rows.append(new_row)

# Passing the full column list keeps the frame well-formed even when no probe
# was generated; dict.fromkeys de-duplicates while preserving column order.
all_columns = list(dict.fromkeys([*df.columns, 'identity', 'probe']))
new_order = ['id', 'category', 'target', 'identity', 'stereotype', 'probe']
df = pd.DataFrame(probe_rows, columns=all_columns)[new_order]
df.to_csv('../Social Bias Probing/SBIC-Pro.csv', index=False)
print(df)

print(df['category'].value_counts())

print(df['identity'].value_counts())

print('\n\n\n\n <----------------------> END of SBIC-Pro Generation \n\n\n\n')